package au.com.acpfg.misc.biojava;
import java.util.HashSet;
import org.biojava.bio.seq.DNATools;
import org.biojava.bio.seq.RNATools;
import org.biojava.bio.symbol.SymbolList;
import org.knime.core.data.DataCell;
import org.knime.core.data.DataRow;
import org.knime.core.data.DataType;
import org.knime.core.data.RowIterator;
import org.knime.core.data.def.DefaultRow;
import org.knime.core.data.def.StringCell;
import org.knime.core.node.BufferedDataContainer;
import org.knime.core.node.BufferedDataTable;
import org.knime.core.node.ExecutionContext;
import org.knime.core.node.NodeLogger;
/**
* This class differs from the superclass in terms of processing at the DNA/RNA level where it
* is more persistent to try to find suitable peptides and supports alternate initiation codons,
* unlike the superclass.
*
* @author andrew.cassin
*
*/
public class TrypticPeptideExtractor_v2 extends TrypticPeptideExtractor {
/**
* Contains all the unique RNA codons which can code for the tryptic terminii (K and/or R amino acids)
*/
private final HashSet<String> terminii_codons = new HashSet<String>();
/**
* Since trypsin almost always does not cleave after a [KR] immediately followed by Proline (P) we
* store the proline codons in a hash set in order to model this behaviour
*/
private final HashSet<String> proline_codons = new HashSet<String>();
/**
* Initialises the private state for this class and invokes the superclass constructor
*
* @param m
* @param task
*/
public TrypticPeptideExtractor_v2(BioJavaProcessorNodeModel m, String task) {
super(m, task);
terminii_codons.add("AAG"); // K
terminii_codons.add("AAA");
terminii_codons.add("CGA"); // R
terminii_codons.add("CGC");
terminii_codons.add("CGG");
terminii_codons.add("CGU");
terminii_codons.add("AGA");
terminii_codons.add("AGG");
proline_codons.add("CCC"); // proline (p)
proline_codons.add("CCG");
proline_codons.add("CCA");
proline_codons.add("CCU");
}
/**
* Replaces the superclass implementation with an implementation that works directly at the
* RNA level and handles IUPAC ambiguity with codons essential to tryptic peptides eg. K/R. In this
* case the algorithm emits all possible peptides which are fully tryptic.
*/
@Override
public void execute(BioJavaProcessorNodeModel mdl, ExecutionContext exec,
NodeLogger l, BufferedDataTable[] inData, BufferedDataContainer c)
throws Exception {
/* only call the superclass for protein sequences, no IUPAC coding in this case */
if (mdl.areSequencesProtein()) {
super.execute(mdl, exec, l, inData, c);
return;
}
// else...
RowIterator it = inData[0].iterator();
double done = 0.0;
int n_rows = inData[0].getRowCount();
while (it.hasNext()) {
DataRow r = it.next();
String[] na_seqs = new String[6]; // RNA reading frames if !is_prot (MUST be in the same frame order as seqs)
String str = mdl.getSequence(r);
if (str == null || str.length() < 1)
continue;
SymbolList syms = mdl.getSequenceAsSymbol(str);
boolean is_dna = mdl.areSequencesDNA();
for (int i=0; i<3; i++) {
// take the reading frame
SymbolList rf = syms.subList(i+1, syms.length()-(syms.length() - i) % 3);
// if it is DNA transcribe it to RNA first
if (is_dna) {
rf = DNATools.toRNA(rf);
}
na_seqs[i+3] = rf.seqString().toUpperCase();
// reverse frame translation
rf = RNATools.reverseComplement(rf);
na_seqs[i] = rf.seqString().toUpperCase();
}
// process the available protein sequences looking for tryptic peptides
DataCell[] cells = new DataCell[5];
cells[0] = new StringCell(mdl.getSequence(r));
cells[1] = DataType.getMissingCell();
cells[2] = DataType.getMissingCell();
cells[3] = DataType.getMissingCell();
cells[4] = DataType.getMissingCell();
// unambiguous tryptics
HashSet<String> unambg_tryptics = new HashSet<String>();
for (String seq : na_seqs) {
for (int i=0; i<seq.length(); i+= 3) {
if (can_code_for(seq.substring(i, i+3), terminii_codons)) {
int len = next_tryptic_terminii(seq.substring(i+3), terminii_codons);
if (len > 0) {
// HACK: superclass uses DNA Thiamine (which is wrong) in codon table
// rather than RNA Uracil, so we make sure this wont throw
String codon_seq = seq.substring(i, i+(6+len*3)).replaceAll("U", "T");
int codon_len = codon_seq.length();
recurse(unambg_tryptics, new StringBuffer(codon_seq), 0, codon_len);
}
}
}
}
if (unambg_tryptics.size() > 0) {
cells[1] = populate_cell(unambg_tryptics, true); // true == omit peptides with stop codons in results
}
c.addRowToTable(new DefaultRow(r.getKey(), cells));
done++;
if (((int)done) % 100 == 0) {
exec.checkCanceled();
exec.setProgress(done / n_rows);
}
}
}
private int next_tryptic_terminii(String codons, HashSet<String> terminii_codons) {
for (int n_codons=6; n_codons<(TrypticPeptideExtractor.MAX_MEASURABLE_PEPTIDE_LENGTH_IN_AA-2); n_codons++) {
int offset = n_codons*3;
if (offset+3 > codons.length())
return -1;
if (can_code_for(codons.substring(offset, offset+3), terminii_codons)) {
// check for proline after K/R and reject if we find one, otherwise accept it
offset += 3;
if (offset+3 > codons.length()) {
// accept an end-of-sequence as not having a proline
return n_codons;
}
if (can_code_for(codons.substring(offset, offset+3), proline_codons)) {
return -1;
}
return n_codons;
}
}
return -1; // not found
}
/**
* Determines if the specified codon (eg. AAC) can code for any of the <code>wanted_codons</code>
* taking into account any IUPAC codes in the specified codon - this routine supports any base
* being called as ambiguous (or all of them) and will test all permutations
*
* @param codon
* @param wanted_codons
* @return returns true if the specified codon satisfies the wanted codons, false otherwise
*/
private boolean can_code_for(String codon, HashSet<String> wanted_codons) {
// if the codon has no ambiguity and codes for a wanted codon then we are done
if (wanted_codons.contains(codon.toUpperCase())) {
return true;
}
// ambiguous base call(s) in codon, but *could potentially* code for one of the wanted codons?
assert(codon != null && codon.length() == 3);
for (int i=0; i<3; i++) {
char c = codon.charAt(i);
if (c != 'A' && c != 'C' && c != 'G' && c != 'U') {
return ok_recursive(0, new StringBuffer(codon), wanted_codons);
}
}
// all failed...
return false;
}
/**
* A helper function to <code>can_code_for()</code> this resolves the ambiguity and once
* resolved, tests the resulting codons against the desired <code>wanted_codons</code>
*
* @param idx
* @param na_codon
* @param wanted_codons
* @return true if the specified codon matches any of the wanted codons, false otherwise
*/
private boolean ok_recursive(int idx, StringBuffer na_codon, HashSet<String> wanted_codons) {
if (idx >= 3) {
return (wanted_codons.contains(na_codon));
}
char c = na_codon.charAt(idx);
if (c == 'R') {
na_codon.setCharAt(idx, 'A');
if (ok_recursive(idx+1, na_codon, wanted_codons))
return true;
na_codon.setCharAt(idx, 'G');
return ok_recursive(idx, na_codon, wanted_codons);
} else if (c == 'Y') {
na_codon.setCharAt(idx, 'C');
if (ok_recursive(idx, na_codon, wanted_codons))
return true;
na_codon.setCharAt(idx, 'U');
return ok_recursive(idx, na_codon, wanted_codons);
// no need to worry about T
} else if (c == 'M') {
na_codon.setCharAt(idx, 'C');
if (ok_recursive(idx, na_codon, wanted_codons))
return true;
na_codon.setCharAt(idx, 'A');
return ok_recursive(idx, na_codon, wanted_codons);
} else if (c == 'K') {
na_codon.setCharAt(idx, 'U');
if (ok_recursive(idx, na_codon, wanted_codons)) {
return true;
}
na_codon.setCharAt(idx, 'G');
return ok_recursive(idx, na_codon, wanted_codons);
// no need to worry about U
} else if (c == 'W') {
na_codon.setCharAt(idx, 'U');
if (ok_recursive(idx, na_codon, wanted_codons)) {
return true;
}
na_codon.setCharAt(idx, 'A');
return ok_recursive(idx, na_codon, wanted_codons);
// no need to worry about T
} else if (c == 'S') {
na_codon.setCharAt(idx, 'C');
if (ok_recursive(idx, na_codon, wanted_codons)) {
return true;
}
na_codon.setCharAt(idx, 'G');
return ok_recursive(idx, na_codon, wanted_codons);
} else if (c == 'B') {
na_codon.setCharAt(idx, 'U');
if (ok_recursive(idx, na_codon, wanted_codons)) {
return true;
}
na_codon.setCharAt(idx, 'C');
if (ok_recursive(idx, na_codon, wanted_codons)) {
return true;
}
na_codon.setCharAt(idx, 'G');
return ok_recursive(idx, na_codon, wanted_codons);
// no need to worry about T
} else if (c == 'D') {
na_codon.setCharAt(idx, 'A');
if (ok_recursive(idx, na_codon, wanted_codons)) {
return true;
}
na_codon.setCharAt(idx, 'T');
if (ok_recursive(idx, na_codon, wanted_codons)) {
return true;
}
na_codon.setCharAt(idx, 'G');
return ok_recursive(idx, na_codon, wanted_codons);
// no need to worry about U
} else if (c == 'H') {
na_codon.setCharAt(idx, 'A');
if (ok_recursive(idx, na_codon, wanted_codons)) {
return true;
}
na_codon.setCharAt(idx, 'U');
if (ok_recursive(idx, na_codon, wanted_codons)) {
return true;
}
na_codon.setCharAt(idx, 'C');
return ok_recursive(idx, na_codon, wanted_codons);
// no need to worry about T
} else if (c == 'V') {
na_codon.setCharAt(idx, 'A');
if (ok_recursive(idx, na_codon, wanted_codons)) {
return true;
}
na_codon.setCharAt(idx, 'C');
if (ok_recursive(idx, na_codon, wanted_codons)) {
return true;
}
na_codon.setCharAt(idx, 'G');
return ok_recursive(idx, na_codon, wanted_codons);
// no need to worry about U
} else if (c == 'N') {
for (char c2 : new char[] { 'A', 'C', 'G', 'U'} ) {
na_codon.setCharAt(idx, c2);
if (ok_recursive(idx, na_codon, wanted_codons))
return true;
}
return false;
}
// else
return ok_recursive(idx+1, na_codon, wanted_codons);
}
}